1 Introduction

The objective of this notebook is to refine the clustering annotation done at level 3. This refinement is the result of a manual curation carried out by specialists to remove poor quality cells, misclassified cells or clusters with very few cells.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Signac)
library(tidyverse)
library(reshape2)
library(ggpubr)
library(harmony)

2.2 Parameters

cell_type = "CD4_T"

# Paths
path_to_obj <- str_c(
  here::here("scATAC-seq/results/R_objects/level_3/"),
  cell_type,
  "/",
  cell_type,
  "_integrated_level_3.rds",
  sep = ""
)

path_to_obj_RNA <- str_c(
  here::here("scRNA-seq/3-clustering/4-level_4/"),
  cell_type,
    "/",
  cell_type,
  "_integrated_level_4.rds",
  sep = ""
)

# Functions
source(here::here("scRNA-seq/bin/utils.R"))


# Colors
color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", 
                    "#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0", 
                    "#1C8356", "#FEAF16", "#822E1C", "#C4451C", 
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", 
                    "#FBE426", "#16FF32", "black", "#3283FE",
                    "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB",
                    "#822E1C", "coral2", "#1CFFCE", "#1CBE4F",
                    "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",   
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
                    "#FEAF16", "#683B79", "#B10DA1", "#1C7F93", 
                    "#F8A19F", "dark orange", "#FEAF16", "#FBE426",  
                    "Brown")

path_to_level_4 <- here::here("scATAC-seq/results/R_objects/level_4/CD4_T/")
path_to_save <- str_c(path_to_level_4, "CD4_T_integrated_level_4.rds")

2.3 Load data

# Seurat object
seurat <- readRDS(path_to_obj)
seurat
## An object of class Seurat 
## 270784 features across 21819 samples within 1 assay 
## Active assay: peaks_macs (270784 features, 210444 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
seurat_RNA <- readRDS(path_to_obj_RNA)
seurat_RNA
## An object of class Seurat 
## 37378 features across 65461 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony

2.4 Visualization of the data

DimPlot(seurat,
  pt.size = 0.1, split.by = "assay") 

p1 <- DimPlot(seurat,
  pt.size = 0.1) + NoLegend()

p2 <- DimPlot(seurat_RNA,
  group.by = "annotation_level_3",
  pt.size = 0.1,cols = color_palette)

p1 + p2

3 Spot potential doublets & poor-quality cells from Multiome experiment

3.1 From scRNA assay

tonsil_RNA_annotation <- seurat_RNA@meta.data %>%
  rownames_to_column(var = "cell_barcode") %>%
  dplyr::filter(assay == "multiome") %>%
  dplyr::select("cell_barcode", "annotation_level_3")

3.2 From scATAC assay

tonsil_ATAC_cell_barcode <- seurat@meta.data %>%
  rownames_to_column(var = "cell_barcode") %>%
  dplyr::filter(assay == "multiome") %>%
  dplyr::select("cell_barcode")
possible_doublets_ATAC <- setdiff(tonsil_ATAC_cell_barcode$cell_barcode,tonsil_RNA_annotation$cell_barcode)
seurat$quality_cells <- ifelse(colnames(seurat) %in% possible_doublets_ATAC, "Poor-quality", "Good-quality")

3.2.1 Visualize the projection of the problematic cells onto the scATAC-seq UMAP

DimPlot(
  seurat, group.by = "quality_cells")

3.3 Cell-type definition

metabolic_poor_quality <- tonsil_RNA_annotation[tonsil_RNA_annotation$annotation_level_3 == "metabolic/poor-quality",]$cell_barcode

DimPlot(
  seurat, 
  cells.highlight = metabolic_poor_quality)

3.4 Scrublet prediction

DimPlot(seurat, group.by = "scrublet_predicted_doublet_atac")

table(seurat$scrublet_predicted_doublet_atac)
## 
## FALSE  TRUE 
## 21089   730

3.5 QC metrics

qc_vars <- c(
  "nCount_peaks",
  "nFeature_peaks",
  "nucleosome_signal",
  "TSS.enrichment"
)
qc_gg <- purrr::map(qc_vars, function(x) {
  p <- FeaturePlot(seurat, features = x, max.cutoff = "q95")
  p
})
qc_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

3.6 Chromatin Signature

qc_vars <- c("NBC.MBC", "GCBC", "PC", "CD4.T", "Cytotoxic")
qc_gg <- purrr::map(qc_vars, function(x) {
  p <- FeaturePlot(seurat, feature = x, 
                   max.cutoff = 4, min.cutoff = -4) + 
    scale_color_viridis_c(option = "magma")
  p
})
qc_gg
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

4 Clustering

resolutions <- c(0.01, 0.025, 0.05, 0.1)
seurat <- FindClusters(seurat, resolution = resolutions)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21819
## Number of edges: 769264
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9904
## Number of communities: 4
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21819
## Number of edges: 769264
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9764
## Number of communities: 4
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21819
## Number of edges: 769264
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9541
## Number of communities: 5
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21819
## Number of edges: 769264
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9250
## Number of communities: 6
## Elapsed time: 3 seconds
vars <- str_c("peaks_macs_snn_res.", resolutions)
umap_clusters <- purrr::map(vars, function(x) {
  p <- DimPlot(seurat, group.by = x, cols = color_palette)
  p
})
umap_clusters
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

4.1 Proportion of doublets per cluster based on Scrublet

doublet_clusters <- purrr::map(vars, function(x) {
  df1 <- data.frame(table(seurat@meta.data[,x], seurat@meta.data$scrublet_predicted_doublet_atac))
  colnames(df1) <- c("Cluster", "Scrublet","Cells")
  p <- ggbarplot(df1, "Cluster", "Cells",
  fill = "Scrublet", color = "Scrublet",
  label = TRUE,
  position = position_dodge(0.9))
  p
})
doublet_clusters
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

4.2 Proportion of doublets per cluster based on Multiome

doublet_clusters <- purrr::map(vars, function(x) {
  df1 <- data.frame(table(seurat@meta.data[,x], seurat$quality_cells))
  colnames(df1) <- c("Cluster", "Quality","Cells")
  p <- ggbarplot(df1, "Cluster", "Cells",
  fill = "Quality",
  label = TRUE,
  position = position_dodge(0.9))
  p
})
doublet_clusters
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

4.3 UMAP level 1

umap_clusters_level1 <- purrr::map(vars, function(x) {
  p <- FeatureScatter(seurat, 
                      "UMAP_1_level_1",
                      "UMAP_2_level_1", group.by = x, cols = color_palette)
  p
})
umap_clusters_level1
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

4.4 UMAP level 1

umap_level_1 <- FeatureScatter(
  seurat,
  "UMAP_1_level_1",
  "UMAP_2_level_1",
  group.by = "annotation_level_1"
)
umap_level_1 <- umap_level_1 +
  theme(
    #legend.position = "none",
    plot.title = element_blank()
  )
umap_level_1

5 Exclude problematic clusters and poor quality cells.

cluster_to_exclude <- c("1","2")
'%ni%' <- Negate('%in%')

selected_cells_clusters <- colnames(seurat)[!(seurat$peaks_macs_snn_res.0.05 %in% cluster_to_exclude)]
selected_cells <- selected_cells_clusters[selected_cells_clusters %ni% metabolic_poor_quality]

length(selected_cells)
## [1] 19904
seurat <- subset(seurat, 
                 cells = selected_cells, 
                 quality_cells == "Good-quality")
seurat
## An object of class Seurat 
## 270784 features across 19321 samples within 1 assay 
## Active assay: peaks_macs (270784 features, 210444 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony

5.1 Visualization after removing problematic cells

DimPlot(seurat)

6 Integration

seurat <- seurat %>%
  RunTFIDF() %>%
  FindTopFeatures(min.cutoff = 10) %>%
  RunSVD()

DepthCor(seurat)

seurat <- RunUMAP(object = seurat, reduction = 'lsi', dims = 2:40)
DimPlot(seurat)

seurat <- RunHarmony(
  object = seurat,
  dims = 2:40,
  group.by.vars = 'assay',
  reduction = 'lsi',
  assay.use = 'peaks_macs',
  project.dim = FALSE,
  max.iter.harmony = 20
)

seurat <- RunUMAP(seurat, reduction = "harmony", dims = 2:40)
seurat <- FindNeighbors(seurat, reduction = "harmony", dims = 2:40)
DimPlot(seurat, cols = color_palette, pt.size = 0.2)

7 Save

saveRDS(seurat, path_to_save)

8 Session Information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] harmony_1.0       Rcpp_1.0.5        ggpubr_0.4.0      reshape2_1.4.4    forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4       readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      ggplot2_3.3.2     tidyverse_1.3.0   Signac_1.1.0.9000 Seurat_3.9.9.9010 BiocStyle_2.16.1 
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.1              SnowballC_0.7.0             rtracklayer_1.48.0          GGally_2.0.0                bit64_4.0.5                 knitr_1.30                  irlba_2.3.3                 DelayedArray_0.14.0         data.table_1.13.2           rpart_4.1-15                RCurl_1.98-1.2              AnnotationFilter_1.12.0     generics_0.0.2              BiocGenerics_0.34.0         GenomicFeatures_1.40.1      cowplot_1.1.0               RSQLite_2.2.1               RANN_2.6.1                  future_1.19.1               bit_4.0.4                   spatstat.data_1.4-3         xml2_1.3.2                  lubridate_1.7.9             httpuv_1.5.4                SummarizedExperiment_1.18.1 assertthat_0.2.1            xfun_0.18                   hms_0.5.3                   evaluate_0.14               promises_1.1.1              fansi_0.4.1                 progress_1.2.2              dbplyr_1.4.4                readxl_1.3.1                igraph_1.2.6                DBI_1.1.0                   htmlwidgets_1.5.2           reshape_0.8.8               stats4_4.0.3                ellipsis_0.3.1              RSpectra_0.16-0             backports_1.1.10           
##  [43] bookdown_0.21               biomaRt_2.44.4              deldir_0.1-29               vctrs_0.3.4                 Biobase_2.48.0              here_1.0.1                  ensembldb_2.12.1            ROCR_1.0-11                 abind_1.4-5                 withr_2.3.0                 ggforce_0.3.2               BSgenome_1.56.0             checkmate_2.0.0             sctransform_0.3.1           GenomicAlignments_1.24.0    prettyunits_1.1.1           goftest_1.2-2               cluster_2.1.0               lazyeval_0.2.2              crayon_1.3.4                labeling_0.4.2              pkgconfig_2.0.3             tweenr_1.0.1                GenomeInfoDb_1.24.0         nlme_3.1-150                ProtGenerics_1.20.0         nnet_7.3-14                 rlang_0.4.7                 globals_0.13.1              lifecycle_0.2.0             miniUI_0.1.1.1              BiocFileCache_1.12.1        modelr_0.1.8                rsvd_1.0.3                  dichromat_2.0-0             rprojroot_2.0.2             cellranger_1.1.0            polyclip_1.10-0             matrixStats_0.57.0          lmtest_0.9-38               graph_1.66.0                Matrix_1.2-18              
##  [85] ggseqlogo_0.1               carData_3.0-4               zoo_1.8-8                   reprex_0.3.0                base64enc_0.1-3             ggridges_0.5.2              png_0.1-7                   viridisLite_0.3.0           bitops_1.0-6                KernSmooth_2.23-17          Biostrings_2.56.0           blob_1.2.1                  jpeg_0.1-8.1                rstatix_0.6.0               S4Vectors_0.26.0            ggsignif_0.6.0              scales_1.1.1                memoise_1.1.0               magrittr_1.5                plyr_1.8.6                  ica_1.0-2                   zlibbioc_1.34.0             compiler_4.0.3              RColorBrewer_1.1-2          fitdistrplus_1.1-1          Rsamtools_2.4.0             cli_2.1.0                   XVector_0.28.0              listenv_0.8.0               patchwork_1.1.0             pbapply_1.4-3               htmlTable_2.1.0             Formula_1.2-4               MASS_7.3-53                 mgcv_1.8-33                 tidyselect_1.1.0            stringi_1.5.3               yaml_2.2.1                  askpass_1.1                 latticeExtra_0.6-29         ggrepel_0.8.2               grid_4.0.3                 
## [127] VariantAnnotation_1.34.0    fastmatch_1.1-0             tools_4.0.3                 future.apply_1.6.0          parallel_4.0.3              rio_0.5.16                  rstudioapi_0.11             foreign_0.8-80              lsa_0.73.2                  gridExtra_2.3               farver_2.0.3                Rtsne_0.15                  digest_0.6.27               BiocManager_1.30.10         shiny_1.5.0                 GenomicRanges_1.40.0        car_3.0-10                  broom_0.7.2                 later_1.1.0.1               RcppAnnoy_0.0.16            OrganismDbi_1.30.0          httr_1.4.2                  AnnotationDbi_1.50.3        ggbio_1.36.0                biovizBase_1.36.0           colorspace_1.4-1            rvest_0.3.6                 XML_3.99-0.3                fs_1.5.0                    tensor_1.5                  reticulate_1.18             IRanges_2.22.1              splines_4.0.3               uwot_0.1.8                  RBGL_1.64.0                 RcppRoll_0.3.0              spatstat.utils_1.17-0       plotly_4.9.2.1              xtable_1.8-4                jsonlite_1.7.1              spatstat_1.64-1             R6_2.4.1                   
## [169] Hmisc_4.4-1                 pillar_1.4.6                htmltools_0.5.0             mime_0.9                    glue_1.4.2                  fastmap_1.0.1               BiocParallel_1.22.0         codetools_0.2-17            lattice_0.20-41             curl_4.3                    leiden_0.3.5                zip_2.1.1                   openxlsx_4.2.3              openssl_1.4.3               survival_3.2-7              rmarkdown_2.5               munsell_0.5.0               GenomeInfoDbData_1.2.3      haven_2.3.1                 gtable_0.3.0